Regression exploration
1 Introduction
1.1 Nettoyage des données
On commence par importer le jeu de données et on vérifie s’il y a des valeurs manquantes, ce qui n’est pas le cas. On peut donc continuer avec l’analyse des données en vérifiant le type des variables :
On va transformer Bonus_Malus en variable binaire et retirer les variables qui ne sont pas utiles pour la prédiction, comme PolID.
library(rmarkdown)
library(dplyr)
# Load the training and test sets (comma-separated, "." as decimal mark).
train <- read.csv("./data/train_set.csv", header = TRUE, sep = ",", dec = ".")
test <- read.csv("./data/test_set.csv", header = TRUE, sep = ",", dec = ".")
# Count the missing values in the training set.
sum(is.na(train))
## [1] 0
# Recode Bonus_Malus as a two-level label: values below 100 become "Bonus",
# every other value (100 included) becomes "Malus".
train$Bonus_Malus <- ifelse(train$Bonus_Malus < 100, "Bonus", "Malus")
test$Bonus_Malus <- ifelse(test$Bonus_Malus < 100, "Bonus", "Malus")
# Drop the policy identifier, which is not useful for prediction.
train <- train %>%
  select(-PolID)
test <- test %>%
  select(-PolID)
# Preview of the data
paged_table(train)
On peut maintenant continuer avec l’analyse des données en vérifiant le type des variables :
library(kableExtra)
# Classify the columns of `train` by type; classifier_variables_tab() is
# defined outside this excerpt.
variables <- classifier_variables_tab(train)
numeric_variables <- data.frame(
  variables_numériques = variables$variables_numeriques
)
# Binary variables are listed together with the categorical ones.
categorical_variables <- data.frame(
  variables_catégorielles = c(
    variables$variables_categorielles,
    variables$variables_binaires
  )
)
# Show the categorical variable names as a styled table.
kable(categorical_variables) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE
  )
| variables_catégorielles |
|---|
| Car_Model |
| Urban_rural_class |
| French_region |
| Bonus_Malus |
| Car_Fuel |
# Show the numeric variable names as a styled table.
kable(numeric_variables) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE
  )
| variables_numériques |
|---|
| Claim |
| Period_Exp |
| Car_Power |
| Car_Age |
| Age |
| Inhab_density |
# Convert the categorical (and binary) columns to factors in both data sets,
# then display the resulting structure of `train`.
variables <- classifier_variables_tab(train)
numeric_variables <- variables$variables_numeriques
categorical_variables <- c(
  variables$variables_categorielles,
  variables$variables_binaires
)
# Each listed column is replaced by factor(column), in train and in test.
train[categorical_variables] <- Map(factor, train[categorical_variables])
test[categorical_variables] <- Map(factor, test[categorical_variables])
str(train)
## 'data.frame': 542389 obs. of 11 variables:
## $ Claim : int 4 5 8 4 11 4 0 0 0 0 ...
## $ Period_Exp : num 0.56 1 0.41 0.27 0.08 0.1 0.96 0.73 0.09 0.73 ...
## $ Car_Power : int 4 7 4 5 4 4 14 10 4 5 ...
## $ Car_Age : int 4 9 12 9 13 1 25 2 12 4 ...
## $ Age : int 46 67 52 23 53 31 49 38 27 32 ...
## $ Bonus_Malus : Factor w/ 2 levels "Bonus","Malus": 1 1 1 1 1 1 1 1 1 1 ...
## $ Car_Model : Factor w/ 11 levels "B1","B10","B11",..: 9 7 1 8 1 4 2 4 10 8 ...
## $ Car_Fuel : Factor w/ 2 levels "Diesel","Regular": 1 1 2 1 2 2 2 1 2 1 ...
## $ Urban_rural_class: Factor w/ 6 levels "A","B","C","D",..: 1 5 4 5 4 5 5 3 3 3 ...
## $ Inhab_density : int 29 4762 824 6924 824 2983 5053 160 229 461 ...
## $ French_region : Factor w/ 22 levels "Alsace","Aquitaine",..: 7 21 13 12 13 17 12 20 6 6 ...
1.2 Étude des variables catégorielles :
1.2.1 Car Model
1.2.2 Bonus_Malus
1.2.3 Urban_rural_class
1.2.4 Car_Fuel
1.3 Étude des variables numériques
1.3.1 Inhab_density
#' Plot the distribution of a numeric column and its relation to `Claim`.
#'
#' Prints three ggplot2 figures, in this order: a density-scaled histogram
#' with a density curve overlaid, a box plot, and a scatter plot of the
#' column against `Claim`.
#'
#' @param data Data frame containing `variable` and a `Claim` column.
#' @param variable Name of the column to plot, as a single string.
#' @return The value of the last `print()` call, i.e. the scatter plot
#'   (ggplot2's print method returns the plot invisibly).
plot_numeric <- function(data, variable) {
  # `.data[[variable]]` replaces the deprecated aes_string(), and
  # after_stat(density) replaces the deprecated `..density..` notation.
  # The x label is set explicitly so it always shows the column name.
  p1 <- ggplot(data, aes(x = .data[[variable]])) +
    geom_histogram(
      aes(y = after_stat(density)),
      bins = 30, fill = "lightblue", color = "black"
    ) +
    geom_density(alpha = 0.2, fill = "#FF6666") +
    labs(
      title = paste("Distribution de la variable", variable),
      x = variable
    ) +
    theme_bw()

  p2 <- ggplot(data, aes(x = .data[[variable]])) +
    geom_boxplot(fill = "lightblue", color = "black") +
    labs(title = paste("Boxplot de la variable", variable), x = variable) +
    theme_bw()

  # Use the `data` argument here: the previous version read the global
  # `train`, so this plot ignored whatever data frame the caller passed in.
  p3 <- ggplot(data, aes(x = .data[[variable]], y = Claim)) +
    geom_point(alpha = 0.6, color = "darkorange") +
    labs(
      title = paste("Relation entre", variable, "et nombre de sinistres"),
      x = variable,
      y = "Nombre de sinistres"
    ) +
    theme_minimal()

  print(p1)
  print(p2)
  print(p3)
}
#' Plot a column split by the number of claims.
#'
#' `Claim` is converted to a factor so that each claim count gets its own
#' fill colour (histogram) or its own box (box plot).
#'
#' @param data Data frame containing `col` and a `Claim` column.
#' @param col Name of the column to plot, as a single string.
#' @param type Which figure to build. `"histogram"` (the default, and the
#'   only figure this function returned before) gives a 20-bin histogram
#'   filled by `Claim`; `"boxplot"` gives one box of `col` per `Claim` value.
#'   The box plot used to be built and then discarded.
#' @return A ggplot object.
box_plot <- function(data, col, type = c("histogram", "boxplot")) {
  type <- match.arg(type)
  data$Claim <- as.factor(data$Claim)

  if (type == "boxplot") {
    return(
      ggplot(data, aes(x = Claim, y = .data[[col]], fill = Claim)) +
        geom_boxplot() +
        labs(
          title = paste("Distribution de", col, "par Claim"),
          x = "Claim",
          y = col
        ) +
        theme_bw()
    )
  }

  # Histogram with 20 bins, filled by claim count.
  ggplot(data, aes(x = .data[[col]], fill = Claim)) +
    geom_histogram(color = "black", bins = 20, alpha = 1) +
    labs(
      title = paste("Histogramme de", col, "par Claim"),
      x = col,
      y = "Nombre"
    ) +
    theme_bw()
}
# Draw the three figures for the Inhab_density column of the training set.
plot_numeric(data = train, variable = "Inhab_density")
## [1] 0
1.4 Analyse de la target
## Analyse des corrélations
Une heatmap pour visualiser les corrélations entre les variables numériques.
library(reshape2)
library(corrplot)
# Correlation matrix of the numeric columns, reshaped to long format so that
# each (Var1, Var2) pair becomes one tile of the heatmap.
num_vars <- train[, c(
  "Claim", "Period_Exp", "Car_Power", "Car_Age", "Age", "Inhab_density"
)]
corr_matrix <- cor(num_vars)
melted_cor <- melt(corr_matrix)
# Diverging palette centred on 0: red for negative, blue for positive values.
ggplot(data = melted_cor, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "red", high = "blue", mid = "white", midpoint = 0
  ) +
  labs(title = "Heatmap des corrélations", x = "", y = "")